1 Load environment

Code
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 2

Import R libraries:

Code
%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code
output_folder = './'
folder = './deg_in_cluster_tables/cluster_' + str(N) + '/'

import os

if not os.path.isdir(folder):
    os.makedirs(folder)

2 Load data

Here we load the anndata:

Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

Here we load the dictionary that associates to each GO term its genes:

Code
adata = sc.read("1_All_Annotated_Triku.h5ad")
adata
AnnData object with n_obs × n_vars = 27925 × 14582
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes'
    var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code
markers = sc.get.rank_genes_groups_df(adata, group=str(N))
intTable(markers, folder=folder, fileName = 'markers_cluster_before_filtering_' + str(N) + '.xlsx', save = True)
Code
markers = markers.set_index('names')
markers = markers[markers.pvals_adj < 0.01]
markers['FDR'] = markers['pvals_adj']
markers['-log10(FDR)'] = -np.log10(markers.pvals_adj)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers
scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference FDR -log10(FDR)
names
COL4A2 93.131279 4.240220 0.0 0.0 1.000000 0.889565 0.0 309.013678
COL4A1 92.239578 4.473654 0.0 0.0 0.999710 0.859372 0.0 309.013678
ZEB2 91.778168 6.360259 0.0 0.0 0.994781 0.168001 0.0 309.013678
COL3A1 91.229744 8.745208 0.0 0.0 1.000000 0.238765 0.0 309.013678
MAN1A1 90.926048 5.642728 0.0 0.0 0.996521 0.312592 0.0 309.013678
... ... ... ... ... ... ... ... ...
SLC7A5 -82.212738 -4.583455 0.0 0.0 0.380400 0.923803 0.0 309.013678
CLDN6 -83.345406 -7.369551 0.0 0.0 0.140041 0.895735 0.0 309.013678
CDH3 -83.556168 -4.574719 0.0 0.0 0.246448 0.929114 0.0 309.013678
SLC38A1 -84.280098 -4.575012 0.0 0.0 0.294578 0.935284 0.0 309.013678
PODXL -88.903313 -6.873848 0.0 0.0 0.209336 0.957019 0.0 309.013678

11525 rows × 8 columns

Code
intTable(markers, folder=folder, fileName = 'markers_cluster_filtered_' + str(N) + '.xlsx', save = True)

3.1 Select up and downregulated markers

3.1.1 Up

Code
pos = markers[markers.logfoldchanges > 1]
pos
scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference FDR -log10(FDR)
names
COL4A2 93.131279 4.240220 0.000000 0.000000 1.000000 0.889565 0.000000 309.013678
COL4A1 92.239578 4.473654 0.000000 0.000000 0.999710 0.859372 0.000000 309.013678
ZEB2 91.778168 6.360259 0.000000 0.000000 0.994781 0.168001 0.000000 309.013678
COL3A1 91.229744 8.745208 0.000000 0.000000 1.000000 0.238765 0.000000 309.013678
MAN1A1 90.926048 5.642728 0.000000 0.000000 0.996521 0.312592 0.000000 309.013678
... ... ... ... ... ... ... ... ...
RAI2 2.689245 1.167971 0.007161 0.009085 0.055668 0.027701 0.009085 2.041657
GRAP2 2.687262 3.175457 0.007204 0.009135 0.031893 0.003677 0.009135 2.039304
ZNF575 2.676554 1.543274 0.007438 0.009427 0.044361 0.016302 0.009427 2.025631
SLC45A2 2.666203 2.808888 0.007671 0.009716 0.032473 0.004453 0.009716 2.012500
CHI3L1 2.660832 1.351641 0.007795 0.009868 0.046100 0.018140 0.009868 2.005756

1802 rows × 8 columns

Code
pos['-log10(FDR)'] = -np.log10(pos.pvals_adj)
#pos = pos.fillna('inf')
pos = pos.replace(np.inf, pos[pos['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
pos['FDR'] = pos['pvals_adj']
up_regGenes = pos.index.tolist()
allUpSelected = adata.var_names.isin(up_regGenes).astype('int').tolist()

3.1.2 Down

Code
neg = markers[markers.logfoldchanges < -1].sort_values(by = 'logfoldchanges')
neg
scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference FDR -log10(FDR)
names
FOXB1 -11.905676 -27.653414 1.105636e-32 2.346097e-32 0.000000 0.125020 2.346097e-32 31.629654
SLC52A3 -12.438708 -27.555775 1.610885e-35 3.516982e-35 0.000000 0.130618 3.516982e-35 34.453830
KCNJ11 -11.820080 -27.438763 3.073928e-32 6.493410e-32 0.000000 0.124122 6.493410e-32 31.187527
BICDL2 -11.742265 -27.394825 7.738591e-32 1.629518e-31 0.000000 0.123304 1.629518e-31 30.787941
LGALS14 -6.874944 -26.727528 6.201408e-12 1.009251e-11 0.000000 0.072193 1.009251e-11 10.996001
... ... ... ... ... ... ... ... ...
DESI2 -32.305519 -1.001727 5.851790e-229 3.426940e-228 0.660191 0.824359 3.426940e-228 227.465094
NAP1L4 -40.579399 -1.001709 0.000000e+00 0.000000e+00 0.807770 0.925764 0.000000e+00 309.013678
RAB5IF -31.241621 -1.001236 2.901493e-214 1.633574e-213 0.621919 0.798905 1.633574e-213 212.786861
C9orf16 -34.725502 -1.000740 3.248261e-264 2.191862e-263 0.733256 0.873958 2.191862e-263 262.659187
CBLL1 -21.161049 -1.000238 2.183095e-99 7.333308e-99 0.358075 0.549109 7.333308e-99 98.134700

3637 rows × 8 columns

Code
neg['-log10(FDR)'] = -np.log10(neg.pvals_adj)
#neg = neg.fillna('inf')
neg = neg.replace(np.inf, neg[neg['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
neg['FDR'] = neg['pvals_adj']
down_regGenes = neg.index.tolist()
allDownSelected = adata.var_names.isin(down_regGenes).astype('int').tolist()

3.1.3 All regulated

Code
all_sign = up_regGenes + down_regGenes
allSelected = adata.var_names.isin(all_sign).astype('int').tolist()
allGenes = adata.var_names.tolist()

4 topGO

4.1 Upregulated

Code
%%R -i allUpSelected -i allGenes

allGenes_v <- c(allUpSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}

Get topGO results:

Code
%%R -o results

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Convert results from R environment to Python:

Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13071,  1638,    10,  5722], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

results_table_py = ro.conversion.rpy2py(results_table)

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0030199 collagen fibril organization 60 37 7.52 3.7e-19 3.692981e-19
1 GO:0030198 extracellular matrix organization 267 120 33.46 3.3e-17 3.278726e-17
2 GO:0001525 angiogenesis 450 137 56.39 4.0e-11 4.019062e-11
3 GO:0007155 cell adhesion 1176 283 147.37 1.1e-09 1.142650e-09
4 GO:0045766 positive regulation of angiogenesis 139 45 17.42 1.4e-09 1.440942e-09
... ... ... ... ... ... ... ...
6269 GO:2000601 positive regulation of Arp2/3 complex-me... 10 0 1.25 1.00000 1.000000e+00
6270 GO:2000765 regulation of cytoplasmic translation 26 0 3.26 1.00000 1.000000e+00
6271 GO:2000767 positive regulation of cytoplasmic trans... 13 0 1.63 1.00000 1.000000e+00
6272 GO:2001138 regulation of phospholipid transport 12 0 1.50 1.00000 1.000000e+00
6273 GO:2001140 positive regulation of phospholipid tran... 11 0 1.38 1.00000 1.000000e+00

6274 rows × 7 columns

Filter results:

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_up.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'BP', fillCol = 'red')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='red', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_up.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13424,  1667,    10,   999], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_up.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'MF', fillCol = 'red')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='red', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_up.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13666,  1716,    10,   656], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_up.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'CC', fillCol = 'red')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='red', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_up.xlsx'), SE = markers)

4.1.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
names
COL4A2 309.013678 0.0
COL4A1 309.013678 0.0
ZEB2 309.013678 0.0
COL3A1 309.013678 0.0
MAN1A1 309.013678 0.0
... ... ...
SLC7A5 309.013678 0.0
CLDN6 309.013678 0.0
CDH3 309.013678 0.0
SLC38A1 309.013678 0.0
PODXL 309.013678 0.0

11525 rows × 2 columns

Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
871
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_up.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, 
                            GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_up)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_up.xlsx', save = True)

4.1.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_up.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_up)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_up.xlsx', save = True)

4.2 Downregulated

Code
%%R -i allDownSelected -i allGenes

allGenes_v <- c(allDownSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}
Code
%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13071,  3275,    10,  6197], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0051301 cell division 575 202 144.07 8.6e-11 8.572892e-11
1 GO:0006268 DNA unwinding involved in DNA replicatio... 22 18 5.51 3.6e-08 3.642153e-08
2 GO:0006271 DNA strand elongation involved in DNA re... 15 13 3.76 9.3e-07 9.334978e-07
3 GO:0007094 mitotic spindle assembly checkpoint sign... 46 30 11.53 7.2e-06 7.184653e-06
4 GO:1900264 positive regulation of DNA-directed DNA ... 13 11 3.26 1.1e-05 1.118943e-05
... ... ... ... ... ... ... ...
6269 GO:1904380 endoplasmic reticulum mannose trimming 13 0 3.26 1.00000 1.000000e+00
6270 GO:1904953 Wnt signaling pathway involved in midbra... 10 0 2.51 1.00000 1.000000e+00
6271 GO:1905038 regulation of membrane lipid metabolic p... 12 0 3.01 1.00000 1.000000e+00
6272 GO:2000303 regulation of ceramide biosynthetic proc... 11 0 2.76 1.00000 1.000000e+00
6273 GO:2000826 regulation of heart morphogenesis 10 0 2.51 1.00000 1.000000e+00

6274 rows × 7 columns

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]
Code
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_down.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'BP', fillCol = 'blue')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='blue', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_down.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13424,  3348,    10,  1138], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_down.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'MF', fillCol = 'blue')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='blue', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_down.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13666,  3419,    10,   789], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_down.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'CC', fillCol = 'blue')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='blue', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_down.xlsx'), SE = markers)

4.2.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
names
COL4A2 309.013678 0.0
COL4A1 309.013678 0.0
ZEB2 309.013678 0.0
COL3A1 309.013678 0.0
MAN1A1 309.013678 0.0
... ... ...
SLC7A5 309.013678 0.0
CLDN6 309.013678 0.0
CDH3 309.013678 0.0
SLC38A1 309.013678 0.0
PODXL 309.013678 0.0

11525 rows × 2 columns

Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
871
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_down.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_down)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_down.xlsx', save = True)

4.2.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_down.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_down)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_down.xlsx', save = True)

4.3 All significant

Code
%%R -i allSelected -i allGenes

allGenes_v <- c(allUpSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}
Code
%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13071,  1638,    10,  5722], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0030199 collagen fibril organization 60 37 7.52 3.7e-19 3.692981e-19
1 GO:0030198 extracellular matrix organization 267 120 33.46 3.3e-17 3.278726e-17
2 GO:0001525 angiogenesis 450 137 56.39 4.0e-11 4.019062e-11
3 GO:0007155 cell adhesion 1176 283 147.37 1.1e-09 1.142650e-09
4 GO:0045766 positive regulation of angiogenesis 139 45 17.42 1.4e-09 1.440942e-09
... ... ... ... ... ... ... ...
6269 GO:2000601 positive regulation of Arp2/3 complex-me... 10 0 1.25 1.00000 1.000000e+00
6270 GO:2000765 regulation of cytoplasmic translation 26 0 3.26 1.00000 1.000000e+00
6271 GO:2000767 positive regulation of cytoplasmic trans... 13 0 1.63 1.00000 1.000000e+00
6272 GO:2001138 regulation of phospholipid transport 12 0 1.50 1.00000 1.000000e+00
6273 GO:2001140 positive regulation of phospholipid tran... 11 0 1.38 1.00000 1.000000e+00

6274 rows × 7 columns

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]
Code
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13424,  1667,    10,   999], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([13666,  1716,    10,   656], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code
%%R -i markers
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.3.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
names
COL4A2 309.013678 0.0
COL4A1 309.013678 0.0
ZEB2 309.013678 0.0
COL3A1 309.013678 0.0
MAN1A1 309.013678 0.0
... ... ...
SLC7A5 309.013678 0.0
CLDN6 309.013678 0.0
CDH3 309.013678 0.0
SLC38A1 309.013678 0.0
PODXL 309.013678 0.0

11525 rows × 2 columns

Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
871
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.3.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)

Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)